library(ProjectTemplate)
cwd <- getwd()
setwd("../")
load.project()
setwd(cwd)
Initial analysis of the logFC calculation results.
Input data is the tidy data frames generated by 2017-02-01-log-fold-change-df_dev.Rmd.
logFC_MgSeq <- readRDS("../data/logFC_MgSeq_df.rds")
logFC_DESeq2 <- readRDS("../data/logFC_DESeq2_df.rds")
logFC_edgeR <- readRDS("../data/logFC_edgeR_df.rds")
glimpse(logFC_MgSeq)
## Observations: 885,220
## Variables: 9
## $ pipe <chr> "dada2", "dada2", "dada2", "dada2", "dada2", "dad...
## $ sampleID <chr> "E01JH0004", "E01JH0004", "E01JH0004", "E01JH0004...
## $ T1 <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...
## $ T2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ featureNames <chr> "Seq8", "Seq10", "Seq107", "Seq36", "Seq175", "Se...
## $ logFC <dbl> -7.2958916, 3.9019979, -3.5145787, 2.8770745, -2....
## $ se <dbl> 0.3071946, 0.1457152, 0.1444001, 0.1504701, 0.156...
## $ pvalues <dbl> 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.00000...
## $ adjPvalues <dbl> 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.00000...
Subset of data to make it easier to work with
logFC_MgSeq_sub <- logFC_MgSeq %>% filter(!is.na(logFC), !is.na(se)) %>% sample_frac(size = 0.25)
logFC estimate distributions:
For all five of the biological replicates most of the logFC estimates were between -2 and 2 log2.
logFC_MgSeq_sub %>% ggplot() + geom_boxplot(aes(x = pipe, y = logFC, color = sampleID)) +
theme_bw()
logFC estimate distributions:
For all five of the biological replicates the standard error for most log fold-change estimates was less than 1. However, there were some log fold-change estimates with large standard error estimates for all biological replicates.
logFC_MgSeq_sub %>% ggplot() +
geom_boxplot(aes(x = pipe, y = se, color = sampleID)) + theme_bw() + scale_y_log10()
Log fold-change estimates with large standard errors are between -3 and and 2. Most of the features with outlier standard errors are from the QIIME pipeline. However, this is potentially biased as QIIME pipeline had the most features to begin with.
outlier_se <- logFC_MgSeq_sub %>% group_by(sampleID) %>%
mutate(u99 = quantile(se, 0.995)) %>% filter(se > u99)
outlier_se %>% {summary(.$logFC)}
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1140 -0.8714 -0.1597 -0.2952 0.2722 2.0920
outlier_se %>% ggplot() + geom_bar(aes(x=pipe)) + theme_bw()
Features with log fold-change estimates with outlier standard error estimates.
outlier_se %>% group_by(pipe) %>% summarise(n_features = n_distinct(featureNames))
## # A tibble: 3 × 2
## pipe n_features
## <chr> <int>
## 1 dada2 13
## 2 mothur 50
## 3 qiime 63
A number of the features with log fold change standard error estimates that are outliers, were outliers for multiple titration comparisons.
outlier_se %>% group_by(pipe, featureNames) %>% summarise(count = n()) %>% filter(count > 2)
## Source: local data frame [8 x 3]
## Groups: pipe [3]
##
## pipe featureNames count
## <chr> <chr> <int>
## 1 dada2 Seq278 7
## 2 mothur Otu00003 8
## 3 mothur Otu00076 11
## 4 mothur Otu00120 5
## 5 mothur Otu00133 4
## 6 mothur Otu00153 3
## 7 mothur Otu00350 3
## 8 qiime 568118 3
logFC_MgSeq_sub %>%
ggplot() + geom_point(aes(x = logFC, y = se, color = factor(T1)), alpha = 0.25) +
scale_y_log10() + facet_wrap(pipe~sampleID, scale = "free_y") + theme_bw()
The log fold change between adjacent titrations will be smaller as the samples are more similar, this effect increases with titration level. This is evident with narrowing of the point distributions. A naive expectation is that the log fold-change should not be greater than the difference in the sample titation values.
Excluding featues with outlier standard error estimates
logFC_MgSeq_sub %>% group_by(sampleID) %>%
mutate(u99 = quantile(se, 0.995)) %>% filter(se < u99) %>%
ggplot() + geom_point(aes(x = logFC, y = se, color = sampleID, shape = pipe), alpha = 0.25) +
geom_vline(aes(xintercept = 1), color = "grey") +
geom_vline(aes(xintercept = -1), color = "grey") +
facet_grid(T1~T2) + theme_bw()
Separate plots for the different pipelines
logFC_MgSeq_sub %>% group_by(sampleID) %>%
mutate(u99 = quantile(se, 0.995)) %>% filter(se < u99, pipe == "dada2") %>%
ggplot() + geom_point(aes(x = logFC, y = se, color = sampleID, shape = pipe), alpha = 0.25) +
geom_vline(aes(xintercept = 1), color = "grey") +
geom_vline(aes(xintercept = -1), color = "grey") +
facet_grid(T1~T2) + theme_bw()
logFC_MgSeq_sub %>% group_by(sampleID) %>%
mutate(u99 = quantile(se, 0.995)) %>% filter(se < u99, pipe == "mothur") %>%
ggplot() + geom_point(aes(x = logFC, y = se, color = sampleID, shape = pipe), alpha = 0.25) +
geom_vline(aes(xintercept = 1), color = "grey") +
geom_vline(aes(xintercept = -1), color = "grey") +
facet_grid(T1~T2) + theme_bw()
logFC_MgSeq_sub %>% group_by(sampleID) %>%
mutate(u99 = quantile(se, 0.995)) %>% filter(se < u99, pipe == "qiime") %>%
ggplot() + geom_point(aes(x = logFC, y = se, color = sampleID, shape = pipe), alpha = 0.25) +
geom_vline(aes(xintercept = 1), color = "grey") +
geom_vline(aes(xintercept = -1), color = "grey") +
facet_grid(T1~T2) + theme_bw()
### DESeq2 logFC and SE Estimates
glimpse(logFC_DESeq2)
## Observations: 885,220
## Variables: 11
## $ pipe <chr> "dada2", "dada2", "dada2", "dada2", "dada2", "d...
## $ sampleID <chr> "E01JH0004", "E01JH0004", "E01JH0004", "E01JH00...
## $ T1 <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,...
## $ T2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ featureNames <chr> "Seq1", "Seq2", "Seq3", "Seq4", "Seq5", "Seq6",...
## $ baseMean <dbl> 6273.479221, 4428.241244, 334.645867, 1907.1513...
## $ log2FoldChange <dbl> -4.41528734, 2.48603970, -1.97655120, 0.1301318...
## $ lfcSE <dbl> 0.2844473, 0.2082474, 2.6937137, 0.1364468, 2.4...
## $ stat <dbl> -15.5223404, 11.9379161, -0.7337644, 0.9537185,...
## $ pvalue <dbl> 2.449515e-54, 7.508077e-33, 4.630923e-01, 3.402...
## $ padj <dbl> 1.504002e-52, 1.152490e-31, 4.970956e-01, 3.730...
Subset of data to make it easier to work with
## renaming variables for consistency between differential abundance methods
logFC_DESeq2_sub <- logFC_DESeq2 %>% dplyr::rename(logFC = log2FoldChange, se = lfcSE) %>%
filter(!is.na(logFC), !is.na(se)) %>% sample_frac(size = 0.25)
For all five of the biological replicates most of the logFC estimates were between -4 and 4 log2. The DADA2 logFC estimates tend to be larger compare to the pipelines. This could be due to improper clustering of sequences. Where a sequence is incorretly clustered with another sequence that is highly abundant in one sample but only improperly sequences are cluster leading to inflated log fold-change estimates.
logFC_DESeq2_sub %>% ggplot() + geom_boxplot(aes(x = pipe, y = logFC, color = sampleID)) +
theme_bw()
No extremely large standard error estimates like thoes observed for fitFeatureModel. Not sure what to make of the standard error estimates for mothur concentrated around 3.4.
logFC_DESeq2_sub %>% ggplot() +
geom_boxplot(aes(x = pipe, y = se, color = sampleID)) + theme_bw()
logFC_DESeq2_sub %>% group_by(sampleID) %>% filter(pipe == "dada2") %>%
# mutate(u99 = quantile(se, 0.995)) %>% filter(se < u99) %>%
ggplot() + geom_point(aes(x = logFC, y = se, color = sampleID, shape = pipe), alpha = 0.25) +
geom_vline(aes(xintercept = 1), color = "grey") +
geom_vline(aes(xintercept = -1), color = "grey") +
facet_grid(T1~T2) + theme_bw()
logFC_DESeq2_sub %>% group_by(sampleID) %>% filter(pipe == "mothur") %>%
# mutate(u99 = quantile(se, 0.995)) %>% filter(se < u99) %>%
ggplot() + geom_point(aes(x = logFC, y = se, color = sampleID, shape = pipe), alpha = 0.25) +
geom_vline(aes(xintercept = 1), color = "grey") +
geom_vline(aes(xintercept = -1), color = "grey") +
facet_grid(T1~T2) + theme_bw()
logFC_DESeq2_sub %>% group_by(sampleID) %>% filter(pipe == "qiime") %>%
# mutate(u99 = quantile(se, 0.995)) %>% filter(se < u99) %>%
ggplot() + geom_point(aes(x = logFC, y = se, color = sampleID, shape = pipe), alpha = 0.25) +
geom_vline(aes(xintercept = 1), color = "grey") +
geom_vline(aes(xintercept = -1), color = "grey") +
facet_grid(T1~T2) + theme_bw()
edgeR did not calculate the standard error for the log fold-change
glimpse(logFC_edgeR)
## Observations: 885,220
## Variables: 25
## $ pipe <chr> "dada2", "dada2", "dada2", "dada2", "dada2", "dada2"...
## $ sampleID <chr> "E01JH0004", "E01JH0004", "E01JH0004", "E01JH0004", ...
## $ T1 <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ...
## $ T2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ OTUname <chr> "Seq88", "Seq13", "Seq86", "Seq30", "Seq18", "Seq46"...
## $ Sequence <chr> "Seq88", "Seq13", "Seq86", "Seq30", "Seq18", "Seq46"...
## $ logFC <dbl> -12.022005, 11.418198, -11.780843, 10.997541, 11.400...
## $ logCPM <dbl> 14.98450, 14.43363, 14.74363, 14.01374, 14.41588, 12...
## $ PValue <dbl> 7.266076e-40, 1.900690e-37, 1.857515e-36, 1.697865e-...
## $ FDR <dbl> 3.894617e-37, 5.093848e-35, 3.318761e-34, 2.275139e-...
## $ Rank1 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Rank2 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Rank3 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Rank4 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Rank5 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Rank6 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Rank7 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Rank8 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ taxonomy1 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ taxonomy2 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ taxonomy3 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ taxonomy4 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ taxonomy5 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ taxonomy6 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ taxonomy7 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
Subset of data to make it easier to work with
## renaming variables for consistency between differential abundance methods
## Subsetting for features of interest
logFC_edgeR_sub <- logFC_edgeR %>%
select(pipe, sampleID, T1, T2, OTUname, logFC) %>% dplyr::rename(featureName = OTUname) %>%
filter(!is.na(logFC)) %>% sample_frac(size = 0.25)
For all five of the biological replicates most of the logFC estimates were between -2 and 2 log2. The DADA2 logFC estimates tend to be larger compare to the pipelines. This could be due to improper clustering of sequences. Where a sequence is incorretly clustered with another sequence that is highly abundant in one sample but only improperly sequences are cluster leading to inflated log fold-change estimates.
logFC_edgeR_sub %>% ggplot() +
geom_hline(aes(yintercept = -2)) + geom_hline(aes(yintercept = 2)) +
geom_boxplot(aes(x = pipe, y = logFC, color = sampleID)) +
theme_bw()
logFC_edgeR_sub %>% ggplot() +
geom_hline(aes(yintercept = -2)) + geom_hline(aes(yintercept = 2)) +
geom_boxplot(aes(x = pipe, y = logFC)) +
theme_bw() + facet_grid(T1~T2)
Can only compare differential abundance methods by pipeline as the features are not directly comparable, can look into comparing by taxonomic classification later.
MgSeq <- logFC_MgSeq %>% filter(!is.na(logFC), !is.na(logFC)) %>%
select(pipe, sampleID, T1, T2, featureNames, logFC) %>%
dplyr::rename(logFC_MgSeq = logFC)
DSeq <- logFC_DESeq2 %>% dplyr::rename(logFC = log2FoldChange) %>%
select(pipe, sampleID, T1, T2, featureNames, logFC) %>%
dplyr::rename(logFC_DESeq2 = logFC)
edge <- logFC_edgeR %>% dplyr::rename(featureNames = OTUname) %>%
select(pipe, sampleID, T1, T2, featureNames, logFC) %>%
dplyr::rename(logFC_edgeR = logFC)
diff_abu_comp <- MgSeq %>% inner_join(DSeq) %>% inner_join(edge)
## Joining, by = c("pipe", "sampleID", "T1", "T2", "featureNames")
## Joining, by = c("pipe", "sampleID", "T1", "T2", "featureNames")
Only 66K features had log fold change estimates for all three differential abundance methods. No features were called for all
diff_abu_comp %>% ggplot() +
geom_abline(aes(intercept = 0, slope = 1), linetype = 2) +
geom_point(aes(x = logFC_MgSeq, y = logFC_DESeq2, fill = sampleID),
shape = 21, color = "white", alpha = 0.25) +
facet_grid(.~pipe)
diff_abu_comp %>% ggplot() +
geom_abline(aes(intercept = 0, slope = 1), linetype = 2) +
geom_point(aes(x = logFC_MgSeq, y = logFC_edgeR, fill = sampleID),
shape = 21, color = "white", alpha = 0.25) +
facet_grid(.~pipe)
diff_abu_comp %>% ggplot() +
geom_abline(aes(intercept = 0, slope = 1), linetype = 2) +
geom_point(aes(x = logFC_DESeq2, y = logFC_edgeR, fill = sampleID),
shape = 21, color = "white", alpha = 0.25) +
facet_grid(.~pipe)
## Downsample for more manageable dataset
diff_abu_comp %>% sample_frac(0.25) %>%
gather("diff_abu", "logFC", -pipe, -sampleID, -T1, -T2,
-featureNames,factor_key = TRUE) %>%
mutate(group_set = paste(pipe, sampleID, T1, T2, featureNames)) %>%
ggplot() +
geom_point(aes(x = diff_abu, y = logFC)) +
geom_line(aes(x = as.numeric(diff_abu), y = logFC, group = group_set), alpha = 0.05) +
facet_wrap(~pipe)
## only features with logFC > +/- 1
diff_abu_comp %>% filter(logFC_MgSeq < -1 | logFC_MgSeq > 1 |
logFC_DESeq2 < -1 | logFC_DESeq2 > 1 |
logFC_edgeR < -1 | logFC_edgeR > 1) %>% #sample_frac(0.25) %>%
gather("diff_abu", "logFC", -pipe, -sampleID, -T1, -T2,
-featureNames,factor_key = TRUE) %>%
mutate(group_set = paste(pipe, sampleID, T1, T2, featureNames)) %>%
ggplot() +
geom_point(aes(x = diff_abu, y = logFC)) +
geom_line(aes(x = as.numeric(diff_abu), y = logFC, group = group_set), alpha = 0.05) +
facet_wrap(~pipe)